# nolint start
# Author: T. M. Jensen Ahokovi

# 1 · Setup -------------------------------------------------------------------

rm(list = ls())
set.seed(123)

# uncomment the line below and add working directory path between the quotation marks
# dir_master <- "ADD WORKING DIRECTORY HERE/"

dir_data <- file.path(dir_master, "Data/")
dir_code <- file.path(dir_master, "Do/")
dir_results <- file.path(dir_master, "Results/")
dir_charts <- file.path(dir_master, "Charts/")

library(tidyverse)   
library(haven)       
library(AER)         
library(clubSandwich) 
library(broom)       
library(boot)        
library(stargazer)   

# 2 · Helper Functions --------------------------------------------------------

#' Perform leave-one-out prediction at state level for 2SLS first stage
#' 
#' @param data Data frame containing the analysis dataset
#' @param formula1 Formula for first model (without normal vote control)
#' @param formula2 Formula for second model (with normal vote control)
#' @return Data frame with original data plus predicted values
predict_for_all_states <- function(data, formula1, formula2) {
  data %>%
    #// Group the data by state to perform leave-one-out at the state level
    group_by(state) %>%
    #// Use group_modify to apply our prediction process to each state group
    group_modify(~{
      #// Create training data by excluding the current state
      #// This is the core of the leave-one-out method: we're creating a model using all states except the current one
      temp_train_data <- data %>% filter(state != .y$state)
      
      #// Fit models on training data
      #// We're creating two separate models here, with and without the normal vote control
      model1 <- lm(formula1, data = temp_train_data)
      model2 <- lm(formula2, data = temp_train_data)
      
      #// Make predictions for the left-out state
      #// We use the models trained on other states to predict values for the current state
      .x %>%
        mutate(
          #// Predict using the first model (without norm_vote_top2_share_sd)
          predicted_total_1k_muni_aid_per_resident_1 = predict(model1, newdata = .x),
          #// Predict using the second model (with norm_vote_top2_share_sd)
          predicted_total_1k_muni_aid_per_resident_2 = predict(model2, newdata = .x)
        )
    }) %>%
    #// After processing all states, we ungroup the data
    ungroup()
}

#' Perform cluster sampling for bootstrap procedure
#' 
#' @param data Data frame input
#' @param clusters Vector of cluster identifiers (states)
#' @return Vector of indices for cluster-sampled observations
cluster_sample <- function(data, clusters) {
  #// Get unique clusters (in this case, states)
  #// This ensures we're sampling at the cluster level, not the individual observation level
  unique_clusters <- unique(clusters)
  
  #// Sample clusters with replacement
  #// This mimics the process of randomly selecting states, allowing for repeats
  #// It's key to bootstrapping: we're creating a new "population" of states from our sample
  sampled_clusters <- sample(unique_clusters, replace = TRUE)
  
  #// Get indices of all observations in sampled clusters
  #// We're now translating our sampled states back into the indices of individual observations
  #// This maintains the within-cluster correlation structure
  indices <- unlist(lapply(sampled_clusters, function(c) which(clusters == c)))
  
  return(indices)
}

#' Bootstrap function for first model (baseline)
#' 
#' @param data Data frame with predicted values
#' @param indices Bootstrap sample indices
#' @return Model coefficients
boot_fn1 <- function(data, indices) {
  #// Subset the data using the provided indices
  #// These indices come from our cluster_sample function, maintaining the clustered structure
  temp_bootstrap_data <- data[indices, ]
  
  #// Fit the model on this bootstrap sample
  #// We're using the predicted values from our first-stage leave-one-out process
  temp_fit <- lm(incumbent_vote_share_top2 ~ predicted_total_1k_muni_aid_per_resident_1, 
                 data = temp_bootstrap_data)
  
  #// Return the coefficients
  #// These coefficients are what we'll use to estimate our standard errors
  return(coef(temp_fit))
}

#' Bootstrap function for second model (baseline + normal vote)
#' 
#' @param data Data frame with predicted values
#' @param indices Bootstrap sample indices
#' @return Model coefficients
boot_fn2 <- function(data, indices) {
  #// Subset the data using the provided indices
  temp_bootstrap_data <- data[indices, ]
  
  #// Fit the model on this bootstrap sample
  temp_fit <- lm(incumbent_vote_share_top2 ~ predicted_total_1k_muni_aid_per_resident_2 + norm_vote_top2_share_sd, 
                 data = temp_bootstrap_data)
  
  #// Return the coefficients
  return(coef(temp_fit))
}

# 3 · Load Data ---------------------------------------------------------------

## 3.1 · Import Main Dataset

#// Load main dataset from Stata file
df_main <- read_dta(file.path(dir_data, "Processing/state_elections_panel_df5_20240731.dta"))

# 4 · Data Processing ---------------------------------------------------------

## 4.1 · Temporal Filtering

#// Subset data for years 2020-2022
df_subset <- df_main %>%
  filter(year >= 2020 & year <= 2022)

## 4.2 · Variable Selection

#// Select relevant variables for analysis
df_analysis <- df_subset %>%
  select(year, state, office, sfips, special_election, 
         total_1k_muni_aid_per_resident, reps_per_million, 
         norm_vote_top2_share_sd, incumbent_vote_share_top2)

# 5 · Analysis ----------------------------------------------------------------

## 5.1 · Leave-One-Out First Stage Estimation

#// Define formulas for prediction
#// These formulas represent our first stage in the 2SLS setup
formula1 <- total_1k_muni_aid_per_resident ~ reps_per_million
formula2 <- total_1k_muni_aid_per_resident ~ reps_per_million + norm_vote_top2_share_sd

#// Run prediction for all states and time the operation
#// This applies our leave-one-out function to the entire dataset
system.time({
  df_with_predictions <- predict_for_all_states(df_analysis, formula1, formula2)
})

## 5.2 · Second Stage Model Estimation

#// Fit second stage models using predicted values from first stage
model_ss1 <- lm(incumbent_vote_share_top2 ~ predicted_total_1k_muni_aid_per_resident_1, 
                data = df_with_predictions)
model_ss2 <- lm(incumbent_vote_share_top2 ~ predicted_total_1k_muni_aid_per_resident_2 + norm_vote_top2_share_sd, 
                data = df_with_predictions)

## 5.3 · Clustered Bootstrap Standard Errors

### 5.3.1 · Bootstrap for First Model

#// Perform bootstrap with 1000 replications for first model
boot_results1 <- boot(
  data = df_with_predictions,  #// Use our final dataset that includes the leave-one-out predictions
  statistic = boot_fn1,        #// Use the function we defined for the first model
  R = 1000,                    #// Number of bootstrap replications
  sim = "parametric",          #// Use parametric simulation
  ran.gen = function(data, p) data[cluster_sample(data, data$state), ]  #// Use our custom cluster sampling function
)

### 5.3.2 · Bootstrap for Second Model

#// Perform bootstrap with 1000 replications for second model
#// This follows the same process as for the first model
boot_results2 <- boot(
  data = df_with_predictions, 
  statistic = boot_fn2,        #// Use the function for the second model
  R = 1000, 
  sim = "parametric", 
  ran.gen = function(data, p) data[cluster_sample(data, data$state), ]
)

### 5.3.3 · Calculate Bootstrap Standard Errors

#// Calculate bootstrapped standard errors
#// We're taking the standard deviation of our bootstrapped coefficients
#// This gives us our estimate of the standard error for each coefficient
boot_se1 <- apply(boot_results1$t, 2, sd)
boot_se2 <- apply(boot_results2$t, 2, sd)

# 6 · Results and Output ------------------------------------------------------

## 6.1 · Extract Model Coefficients

#// Extract coefficients from original models
coef1 <- coef(model_ss1)
coef2 <- coef(model_ss2)

## 6.2 · Generate Results Table

#// Generate regression table with stargazer
stargazer(model_ss1, model_ss2,
          se = list(boot_se1, boot_se2),  #// Use bootstrapped standard errors instead of model-based SEs
          title = "Baseline Results with Leave-One-Out Predictions",
          type = "text",  #// Output as text (could be "latex" or "html" for different formats)
          keep.stat = c("n", "rsq"),  #// Keep only sample size (n) and R-squared in the output
          align = TRUE,  #// Align decimal points
          no.space = TRUE,  #// Remove extra spacing
          column.labels = c("Baseline", "Baseline + Norm. Vote")  #// Labels for each model column
)

# nolint end